How to fit a Logistic Regression using Bayesian Methods
The advantage of using Bayes to overcome separation in Logistic Regression
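To recall what separation means, here is a minimal sketch with hypothetical, perfectly separated data (not the data used below): the maximum likelihood estimate does not exist, glm() typically warns that fitted probabilities numerically 0 or 1 occurred, and the estimates and standard errors explode, whereas a Bayesian fit with proper priors stays finite.

# Hypothetical toy data with complete separation: y = 1 exactly when x > 5
x <- 1:10
y <- as.numeric(x > 5)
sepFit <- glm(y ~ x, family = "binomial")  # typically warns: fitted probabilities numerically 0 or 1 occurred
summary(sepFit)$coefficients               # huge estimates and standard errors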
mydf <- read.csv(file = 'data/HtWtData300.csv')
head(mydf)
male height weight
1 0 64.0 136.4
2 0 62.3 215.1
3 1 67.9 173.6
4 0 64.2 117.3
5 0 64.8 123.3
6 0 57.5 96.5
The proportion of each class is close to 0.5:
table(mydf$male)
0 1
152 148
prop.table(table(mydf$male))
0 1
0.5066667 0.4933333
First, we try to fit the model with glm():
fmod1 <- glm(male ~ height + weight,
             family = "binomial", data = mydf)
summary(fmod1)
Call:
glm(formula = male ~ height + weight, family = "binomial", data = mydf)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.7751 -0.4674 -0.0165 0.3655 3.1906
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -59.994880 7.136907 -8.406 < 2e-16 ***
height 0.890858 0.108873 8.183 2.78e-16 ***
weight 0.005320 0.005225 1.018 0.309
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 415.83 on 299 degrees of freedom
Residual deviance: 189.49 on 297 degrees of freedom
AIC: 195.49
Number of Fisher Scoring iterations: 6
\(\Rightarrow\) We will see that these estimates are essentially the same as the ones from the Bayesian method.
plot(mydf$height, mydf$weight, pch=16, col="blue")
points(mydf$height[mydf$male==1],
       mydf$weight[mydf$male==1],
       col="orange", pch=16)
legend("topleft", legend=c("Female","Male"),
       col=c("blue","orange"), pch=16)
Model for dichotomous y with multiple metric predictors, set up in Stan
First, we look at a simplified modelString for Stan:
# Not eval
<- "
modelString data{
int<lower=0> N;
real xc[N];
int xb[N];
int<lower=0,upper=1> y[N]; //Binary outcome
}
parameters{
real b0;
real b1;
real b2;
}
model{
b0 ~ normal(0,1);
b1 ~ normal(0,1);
b2 ~ normal(0,1);
y ~ bernoulli_logit(b0 + b1*xc + b2*xb);
}
"
Then move on to the full model:
<- "
modelString data {
int<lower=1> Ntotal; // num of observations
int<lower=1> Nx; // num of predictors
int<lower=0, upper=1> y[Ntotal];
matrix[Ntotal, Nx] x;
}
transformed data {
vector[Nx] meanX;
vector[Nx] sdX;
matrix[Ntotal, Nx] zx; // normalized
for ( j in 1:Nx ) {
meanX[j] = mean(x[,j]);
sdX[j] = sd(x[,j]);
for (i in 1:Ntotal) {
zx[i,j] = (x[i,j] - meanX[j]) / sdX[j];
}
}
}
parameters {
real zbeta0;
vector[Nx] zbeta;
}
transformed parameters{
vector[Ntotal] mu;
mu = zbeta0 + zx * zbeta; // matrix product
}
model {
zbeta0 ~ normal(0, 2);
zbeta ~ normal(0, 2);
y ~ bernoulli_logit(mu);
}
generated quantities {
// Transform to original scale:
real beta0;
vector[Nx] beta;
// .* and ./ are element-wise product and division
beta0 = zbeta0 - sum(zbeta .* meanX ./ sdX);
beta = zbeta ./ sdX;
}
"
stanDsoLogistic <- stan_model(model_code=modelString)
Fit model
heightWeightDataList <- list(Ntotal = nrow(mydf),
                             y = mydf$male,
                             x = cbind(mydf$height, mydf$weight),
                             Nx = 2)
fit <- sampling(stanDsoLogistic,
                data = heightWeightDataList,
                pars = c('beta0', 'beta'),
                iter = 5000, chains = 2, cores = 2)
Analyze fitted model using shinystan
launch_shinystan(fit)
Analyze parameters.
stan_ac(fit, separate_chains = T)
pairs(fit)
plot(fit)
plot(fit,pars=c("beta"))
plot(fit,pars=c("beta[2]"))
summary(fit)$summary[,c(1,4,8)]
mean 2.5% 97.5%
beta0 -58.992924983 -7.323391e+01 -46.12530030
beta[1] 0.875179982 6.768256e-01 1.09355795
beta[2] 0.005549854 -4.832449e-03 0.01596438
lp__ -97.715598858 -1.009842e+02 -96.26790960
Parameter \(\beta_2\) is not significant: its 95% HDI contains zero.
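As a quick check (a sketch; the interval above already shows the same thing), we can inspect the posterior draws of the weight slope directly:

# Posterior draws of the slopes on the original scale
drawsBeta <- rstan::extract(fit, pars = "beta")$beta  # matrix: draws x 2
mean(drawsBeta[, 2] > 0)                              # posterior mass above zero for the weight slope
quantile(drawsBeta[, 2], c(0.025, 0.975))             # central 95% interval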
stan_dens(fit)
estimBetas <- summary(fit)$summary[1:3,1]
Plot the data with the separating hyperplane.
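The plotted line is the decision boundary where the posterior-mean linear predictor is zero, i.e. where the predicted probability equals 0.5:
\[
\beta_0 + \beta_1\,\text{height} + \beta_2\,\text{weight} = 0
\quad\Longrightarrow\quad
\text{weight} = -\frac{\beta_0 + \beta_1\,\text{height}}{\beta_2}.
\]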
plot(mydf$height, mydf$weight, pch=16, col="blue")
points(mydf$height[mydf$male==1], mydf$weight[mydf$male==1], col = "orange", pch = 16)
lines(mydf$height, -(estimBetas[1]+estimBetas[2]*mydf$height)/estimBetas[3])
legend("topleft", legend = c("Female","Male"), col = c("blue","orange"), pch=16)
Now try to remove almost all males from the sample to see what may happen when there are few 1’s observed.
In the original sample the proportion of males is:
mean(mydf$male)
[1] 0.4933333
The subsample with females is:
females <- mydf[mydf$male == 0,]
Select first 15 males.
males <- mydf[mydf$male == 1,][1:15,] # just 15 males (originally was ~150)
mydf_sparse <- rbind(males, females)
rownames(mydf_sparse) <- NULL
head(mydf_sparse, 20)
male height weight
1 1 67.9 173.6
2 1 70.2 191.1
3 1 71.1 193.9
4 1 66.5 127.1
5 1 75.1 204.4
6 1 64.6 143.4
7 1 69.2 124.4
8 1 68.1 140.9
9 1 72.6 164.7
10 1 71.5 193.6
11 1 76.0 180.0
12 1 69.7 155.0
13 1 73.3 188.2
14 1 68.3 178.6
15 1 70.8 207.3
16 0 64.0 136.4
17 0 62.3 215.1
18 0 64.2 117.3
19 0 64.8 123.3
20 0 57.5 96.5
Fit sparse model
fmod2 <- glm(male ~ height + weight, family = "binomial", data = mydf_sparse)
summary(fmod2)
Call:
glm(formula = male ~ height + weight, family = "binomial", data = mydf_sparse)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.28353 -0.17117 -0.06829 -0.01696 3.15051
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -84.24302 20.03395 -4.205 2.61e-05 ***
height 1.25377 0.30728 4.080 4.50e-05 ***
weight -0.01190 0.01248 -0.953 0.34
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 100.909 on 166 degrees of freedom
Residual deviance: 35.582 on 164 degrees of freedom
AIC: 41.582
Number of Fisher Scoring iterations: 8
heightWeightSparseDataList <- list(Ntotal=nrow(mydf_sparse),
                                   y=mydf_sparse$male,
                                   x=cbind(mydf_sparse$height, mydf_sparse$weight),
                                   Nx=2)
fit_sparse <- sampling(stanDsoLogistic,
                       data=heightWeightSparseDataList,
                       pars=c('beta0', 'beta'),
                       iter=5000, chains = 2, cores = 2)
stan_ac(fit_sparse, separate_chains = T)
pairs(fit_sparse)
plot(fit_sparse)
plot(fit_sparse,pars=c("beta"))
plot(fit_sparse,pars=c("beta[2]"))
Compare summaries of the two fits.
rbind(beta0reg=summary(fit)$summary[1,c(1,3)],
beta0glm=c(summary(fmod1)$coefficients[1,1],summary(fmod1)$coefficients[1,2]*sqrt(dim(mydf)[1])),
beta0sparce=summary(fit_sparse)$summary[1,c(1,3)],
beta0sparceglm=c(summary(fmod2)$coefficients[1,1],summary(fmod2)$coefficients[1,2]*sqrt(dim(mydf_sparse)[1])))
mean sd
beta0reg -58.99292 6.931705
beta0glm -59.99488 123.614858
beta0sparce -65.45501 12.684797
beta0sparceglm -84.24302 258.895713
rbind(beta1reg=summary(fit)$summary[2,c(1,3)],
beta1glm=c(summary(fmod1)$coefficients[2,1],summary(fmod1)$coefficients[2,2]*sqrt(dim(mydf)[1])),
beta1sparce=summary(fit_sparse)$summary[2,c(1,3)],
beta1sparceglm=c(summary(fmod2)$coefficients[2,1],summary(fmod2)$coefficients[2,2]*sqrt(dim(mydf_sparse)[1])))
mean sd
beta1reg 0.8751800 0.1059241
beta1glm 0.8908578 1.8857272
beta1sparce 0.9665638 0.1937020
beta1sparceglm 1.2537728 3.9708903
rbind(beta2reg=summary(fit)$summary[3,c(1,3)],
beta2glm=c(summary(fmod1)$coefficients[3,1],summary(fmod1)$coefficients[3,2]*sqrt(dim(mydf)[1])),
beta2sparce=summary(fit_sparse)$summary[3,c(1,3)],
beta2sparceglm=c(summary(fmod2)$coefficients[3,1],summary(fmod2)$coefficients[3,2]*sqrt(dim(mydf_sparse)[1])))
mean sd
beta2reg 0.005549854 0.005325699
beta2glm 0.005319759 0.090499569
beta2sparce -0.007222053 0.010259493
beta2sparceglm -0.011900490 0.161304086
rbind(beta0reg=summary(fit)$summary[1,c(4,8)],
beta0sparce=summary(fit_sparse)$summary[1,c(4,8)])
2.5% 97.5%
beta0reg -73.23391 -46.12530
beta0sparce -92.80089 -42.89777
rbind(beta1reg=summary(fit)$summary[2,c(4,8)],
beta1sparce=summary(fit_sparse)$summary[2,c(4,8)])
2.5% 97.5%
beta1reg 0.6768256 1.093558
beta1sparce 0.6177309 1.379462
rbind(beta2reg=summary(fit)$summary[3,c(4,8)],
beta2sparce=summary(fit_sparse)$summary[3,c(4,8)])
2.5% 97.5%
beta2reg -0.004832449 0.01596438
beta2sparce -0.028192807 0.01230010
HDIs of both slopes widened significantly in the sample with the more extreme class disproportion.
Standard deviations of the betas also increase dramatically, especially for the fit with glm().
Observe the data of the previous section.
Plot male (1) and female (0) groups with respect to weight.
plot(mydf$weight,mydf$male)
In the lower right corner there are some outliers representing heavy females.
Such observations bias the model parameters.
plot(mydf$height,mydf$male)
On the plot against height, the outliers are short males.
What does a model robust to such outliers look like?
Robust logistic regression is a mix of a "guessing" model and the logistic model:
\[\mu = \alpha \frac{1}{2} + (1 - \alpha) \ \text{logistic}\Big(\beta_0 + \sum_j \beta_j x_j\Big)\]
When \(\alpha = 0\) all observations fit the logistic regression pattern, i.e. the classes are separated well enough to be explained by the logistic sigmoid function; in this case the model becomes pure logistic regression.
When \(\alpha = 1\) the classes overlap completely and the model can only guess with probability 0.5 to which class the observation belongs.
Typically \(\alpha\) is small (only a few outliers for which the predictor points to the wrong class).
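For example, with \(\alpha = 0.1\) the predicted probability is confined to \([0.05,\,0.95]\): even an observation far from the class boundary cannot receive a probability more extreme than \(1 - \alpha/2\).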
Select a beta distribution as the prior for \(\alpha\), with high concentration near zero: dbeta(1, 9).
Argument <- seq(from=0, to=1, by=.01)
plot(Argument, dbeta(Argument, 1, 9), type="l")
The modified model is below.
modelString = "
data { // ROBUST LOGISTIC REGRESSION
int<lower=1> Ntotal; // num of observations
int<lower=1> Nx; // num of predictors
int<lower=0, upper=1> y[Ntotal];
matrix[Ntotal, Nx] x;
}
transformed data {
vector[Nx] meanX;
vector[Nx] sdX;
matrix[Ntotal, Nx] zx; // normalized
for ( j in 1:Nx ) {
meanX[j] = mean(x[,j]);
sdX[j] = sd(x[,j]);
for ( i in 1:Ntotal ) {
zx[i,j] = ( x[i,j] - meanX[j] ) / sdX[j];
}
}
}
parameters {
real zbeta0;
vector[Nx] zbeta;
real<lower=0,upper=1> guess; // mixture param
}
transformed parameters{
vector[Ntotal] mu;
for ( i in 1:Ntotal ) {
mu[i] = guess * (1/2.0) + (1-guess) * inv_logit(zbeta0 + zx[i,] * zbeta);
}
}
model {
zbeta0 ~ normal(0, 2);
zbeta ~ normal(0, 2);
guess ~ beta(1, 9);
y ~ bernoulli(mu);
}
generated quantities {
// Transform to original scale:
real beta0;
vector[Nx] beta;
// .* and ./ are element-wise product and division
beta0 = zbeta0 - sum( zbeta .* meanX ./ sdX );
beta = zbeta ./ sdX;
}
"
stanDsoRobustLogistic <- stan_model(model_code=modelString)
Run the robust MCMC with the height/weight data.
fitRobust <- sampling(stanDsoRobustLogistic,
                      data=heightWeightDataList,
                      pars=c('beta0', 'beta', 'guess'),
                      iter=5000, chains = 2, cores = 2)
Analyze results.
stan_ac(fitRobust, separate_chains = T)
pairs(fitRobust)
plot(fitRobust)
plot(fitRobust,pars=c("beta[1]"))
plot(fitRobust,pars=c("beta[2]"))
plot(fitRobust,pars=c("guess"))
rbind(summary(fitRobust)$summary[,c(1,4,7)],
summary(fit)$summary[,c(1,4,8)]
)
mean 2.5% 75%
beta0 -6.794151e+01 -9.040257e+01 -60.86879681
beta[1] 1.004011e+00 7.406505e-01 1.09734128
beta[2] 7.843206e-03 -4.273673e-03 0.01185354
guess 3.618337e-02 2.272841e-03 0.05068176
lp__ -1.019951e+02 -1.056117e+02 -100.89209843
beta0 -5.899292e+01 -7.323391e+01 -46.12530030
beta[1] 8.751800e-01 6.768256e-01 1.09355795
beta[2] 5.549854e-03 -4.832449e-03 0.01596438
lp__ -9.771560e+01 -1.009842e+02 -96.26790960
Note the positive correlation between guess and the slope \(\beta_1\): the more outliers, the higher the slope.
Since \(\beta_2\) does not seem significant, we now fit both models with height as the only predictor.
heightWeightDataList <- list(Ntotal=nrow(mydf),
                             y=mydf$male,
                             x=cbind(mydf$height),
                             Nx=1)
fit <- sampling(stanDsoLogistic,
                data=heightWeightDataList,
                pars=c('beta0', 'beta'),
                iter=5000, chains = 2, cores = 2)
fitRobust <- sampling(stanDsoRobustLogistic,
                      data=heightWeightDataList,
                      pars=c('beta0', 'beta', 'guess'),
                      iter=5000, chains = 2, cores = 2)
pairs(fit)
pairs(fitRobust)
plot(fit)
plot(fit,pars=c("beta"))
plot(fitRobust)
plot(fitRobust,pars=c("beta[1]","guess"))
plot(fitRobust,pars=c("guess"))
rbind(summary(fitRobust)$summary[,c(1,4,7)],
summary(fit)$summary[,c(1,4,8)]
)
mean 2.5% 75%
beta0 -66.88526189 -8.866638e+01 -60.00415120
beta[1] 1.00688726 7.605573e-01 1.09231329
guess 0.03243383 1.645023e-03 0.04555081
lp__ -102.24849261 -1.055917e+02 -101.29745249
beta0 -59.40455205 -7.350401e+01 -46.79642504
beta[1] 0.89464752 7.033695e-01 1.10703605
lp__ -97.75056275 -1.006445e+02 -96.76446560
Compare probabilities predicted by logistic regression and robust logistic regression.
# Coefficients
meanBeta0Robust <- summary(fitRobust)$summary[1,1]
meanBeta1Robust <- summary(fitRobust)$summary[2,1]
guess <- summary(fitRobust)$summary[3,1]
meanBeta0 <- summary(fit)$summary[1,1]
meanBeta1 <- summary(fit)$summary[2,1]

# Linear predictors and probabilities
linPredRobust_Male.Height <- meanBeta0Robust + meanBeta1Robust*mydf$height
pRobustMail_height <- guess/2 + (1-guess)*exp(linPredRobust_Male.Height)/(1+exp(linPredRobust_Male.Height))
linPred_Male.Height <- meanBeta0 + meanBeta1*mydf$height
pMail_height <- exp(linPred_Male.Height)/(1+exp(linPred_Male.Height))
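As a side note, base R's plogis() is the inverse-logit (logistic CDF), so the probability lines above could be written equivalently, for example:

# Equivalent computation with the built-in inverse logit
pMail_height_alt <- plogis(meanBeta0 + meanBeta1 * mydf$height)
all.equal(unname(pMail_height), unname(pMail_height_alt))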
# Plot
plot(mydf$height,mydf$male,pch=16)
points(mydf$height,pRobustMail_height,col="orange",pch=16)
points(mydf$height,pMail_height,col="cyan",pch=16)
legend("topleft",
legend=c("Actual","Prob Logistic","Prob. Robust"),
col=c("black","cyan","orange"), pch=16)
The logistic probability is more extreme: it is higher around the low-height male observations because it is biased by an outlier, a short male below the average female height, while in the far tails it is closer to 0.
Robust logistic regression did not react to this observation nearly as much.
The robust model does not produce probabilities too close to 0 or 1.
Repeat the same comparison with the sparse data.
Create data list with one predictor.
heightWeightSparseDataList <- list(Ntotal = nrow(mydf_sparse),
                                   y = mydf_sparse$male,
                                   x = cbind(mydf_sparse$height),
                                   Nx = 1)
Fit both models with only one predictor, height.
fitSparse <- sampling(stanDsoLogistic,
                      data=heightWeightSparseDataList,
                      pars=c('beta0', 'beta'),
                      iter=5000, chains = 2, cores = 2)
fitSparseRobust <- sampling(stanDsoRobustLogistic,
                            data=heightWeightSparseDataList,
                            pars=c('beta0', 'beta', 'guess'),
                            iter=5000, chains = 2, cores = 2)
Analyze the models.
pairs(fitSparse)
pairs(fitSparseRobust)
plot(fitSparse)
plot(fitSparse,pars=c("beta"))
plot(fitSparseRobust)
plot(fitSparseRobust,pars=c("beta[1]","guess"))
plot(fitSparseRobust,pars=c("guess"))
rbind(summary(fitSparseRobust)$summary[,c(1,4,7)],
summary(fitSparse)$summary[,c(1,4,8)]
)
mean 2.5% 75%
beta0 -65.21124168 -92.353657075 -56.21273405
beta[1] 0.94557811 0.613894602 1.06360012
guess 0.01666415 0.000493968 0.02404738
lp__ -28.82697378 -32.226208453 -27.88666403
beta0 -62.74745106 -90.025601491 -40.85149755
beta[1] 0.90990203 0.584875829 1.31576470
lp__ -23.23831969 -26.128614012 -22.20502250
Make a plot of the probabilities.
# Coefficients
meanBeta0Robust <- summary(fitSparseRobust)$summary[1,1]
meanBeta1Robust <- summary(fitSparseRobust)$summary[2,1]
guess <- summary(fitSparseRobust)$summary[3,1]
meanBeta0 <- summary(fitSparse)$summary[1,1]
meanBeta1 <- summary(fitSparse)$summary[2,1]

# Linear predictors and probabilities
linPredRobust_Male.Height <- meanBeta0Robust + meanBeta1Robust*mydf_sparse$height
pRobustMail_height <- guess/2 + (1-guess)*exp(linPredRobust_Male.Height)/(1+exp(linPredRobust_Male.Height))
linPred_Male.Height <- meanBeta0 + meanBeta1*mydf_sparse$height
pMail_height <- exp(linPred_Male.Height)/(1+exp(linPred_Male.Height))
# Plot
plot(mydf_sparse$height,mydf_sparse$male,pch=16)
points(mydf_sparse$height,pRobustMail_height,col="orange",pch=16)
points(mydf_sparse$height,pMail_height,col="cyan",pch=16)
legend("topleft",
legend=c("Actual","Prob Logistic","Prob. Robust"),
col=c("black","cyan","orange"),pch=16)
This data example is from library DAAG.
Thirty patients were given an anesthetic agent maintained at a predetermined concentration level (conc) for 15 minutes before making an incision. It was then noted whether the patient moved (1), i.e. jerked or twisted.
library(DAAG)
head(anesthetic)
move conc logconc nomove
1 0 1.0 0.0000000 1
2 1 1.2 0.1823216 0
3 0 1.4 0.3364722 1
4 1 1.4 0.3364722 0
5 1 1.2 0.1823216 0
6 0 2.5 0.9162907 1
Use column move as response and column logconc as predictor.
Prepare the data.
dataListAnesthetic <- list(Ntotal=nrow(anesthetic),
                           y=anesthetic$move,
                           x=cbind(anesthetic$logconc),
                           Nx=1)
Fit with glm():
logRegr <- glm(move ~ logconc,
               data=anesthetic,
               family="binomial")
summary(logRegr)
Call:
glm(formula = move ~ logconc, family = "binomial", data = anesthetic)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.1477 -0.6962 -0.1209 0.7586 1.7528
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.8077 0.5709 1.415 0.15715
logconc -6.2457 2.2618 -2.761 0.00575 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 41.455 on 29 degrees of freedom
Residual deviance: 28.007 on 28 degrees of freedom
AIC: 32.007
Number of Fisher Scoring iterations: 5
predLogRegr <- predict(logRegr, type="response")
Run MCMC using logistic and robust logistic models.
fitAnesth <- sampling(stanDsoLogistic,
                      data=dataListAnesthetic,
                      pars=c('beta0', 'beta'),
                      iter=5000, chains = 2, cores = 2)
fitRobustAnesth <- sampling(stanDsoRobustLogistic,
                            data=dataListAnesthetic,
                            pars=c('beta0', 'beta', 'guess'),
                            iter=5000, chains = 2, cores = 2)
Extract diagnostics
summary(fitAnesth)$summary[,c(1,4,8:10)]
mean 2.5% 97.5% n_eff Rhat
beta0 0.8195786 -0.1744736 1.944504 3161.224 1.0004414
beta[1] -6.1980944 -10.7384617 -2.545248 2779.034 0.9997342
lp__ -15.4770746 -18.1920124 -14.472321 2004.009 1.0012005
stan_ac(fitAnesth, separate_chains = T)
stan_trace(fitAnesth)
pairs(fitAnesth,pars=c("beta0","beta"))
plot(fitAnesth)
summary(fitRobustAnesth)$summary[,c(1,4,8:10)]
mean 2.5% 97.5% n_eff Rhat
beta0 1.03390884 -0.204622023 2.7745487 2884.167 1.0002948
beta[1] -7.38947230 -14.369144436 -2.7431485 2649.559 1.0022518
guess 0.09805441 0.003237798 0.2942556 3631.455 0.9997073
lp__ -19.53242382 -22.933897477 -18.0116805 1675.079 1.0019349
stan_ac(fitRobustAnesth, separate_chains = T)
stan_trace(fitRobustAnesth)
pairs(fitRobustAnesth,pars=c("beta0","beta","guess"))
plot(fitRobustAnesth)
plot(fitRobustAnesth,pars=c("guess"))
Parameter guess is almost \(10\%\).
This suggests that there should be a significant difference between the two models.
However, the 95% HDI of guess almost reaches \(0\).
Compare intercepts.
rbind(Logistic=summary(fitAnesth)$summary[1,c(1,4,8)],
Robust=summary(fitRobustAnesth)$summary[1,c(1,4,8)])
mean 2.5% 97.5%
Logistic 0.8195786 -0.1744736 1.944504
Robust 1.0339088 -0.2046220 2.774549
Compare slopes.
rbind(Logistic=summary(fitAnesth)$summary[2,c(1,4,8)],
Robust=summary(fitRobustAnesth)$summary[2,c(1,4,8)])
mean 2.5% 97.5%
Logistic -6.198094 -10.73846 -2.545248
Robust -7.389472 -14.36914 -2.743148
Compare probabilities.
# Coefficients
meanBeta0Robust <- summary(fitRobustAnesth)$summary[1,1]
meanBeta1Robust <- summary(fitRobustAnesth)$summary[2,1]
guess <- summary(fitRobustAnesth)$summary[3,1]
meanBeta0 <- summary(fitAnesth)$summary[1,1]
meanBeta1 <- summary(fitAnesth)$summary[2,1]

# Linear predictors and probabilities
linPredRobust_Move <- meanBeta0Robust + meanBeta1Robust*anesthetic$logconc
pRobustMove <- guess/2 + (1-guess)*exp(linPredRobust_Move)/(1+exp(linPredRobust_Move))
linPred_Move <- meanBeta0 + meanBeta1*anesthetic$logconc
pMove <- exp(linPred_Move)/(1+exp(linPred_Move))
# Plot
plot(anesthetic$logconc,anesthetic$move,pch=16)
points(anesthetic$logconc,pRobustMove,col="orange",pch=15)
points(anesthetic$logconc,pMove,col="cyan",pch=17)
points(anesthetic$logconc,predLogRegr,col="purple",pch=25)
legend("topright",
legend=c("Actual","Prob Logistic","Prob. Robust","Glm"),
col=c("black","cyan","orange","purple"),pch=c(16,17,15,25))
Again, the robust method does not return extreme probabilities.
Softmax (or multinomial logistic regression) is a generalization of logistic regression to the case where we want to handle multiple classes. In logistic regression the response was binary, \(y^{(i)} \in \{0,1\}\), whereas softmax handles \(y^{(i)} \in \{1, \ldots, K\}\).
It is based on a modification of the odds from \[\frac{p}{1-p}\] to \[\frac{p_s}{p_r},\] where \(p_r\) is the probability of one of several classes selected as the reference (for example, a control group).
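Since, in the softmax model, the probability of class \(s\) is \(p_s = e^{\lambda_s}/\sum_k e^{\lambda_k}\) (the formula used later for prediction), this generalized odds reduces to
\[
\frac{p_s}{p_r} = e^{\lambda_s - \lambda_r},
\]
so each class's linear predictor, relative to the reference, is the log of this generalized odds.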
The model changes very little from logistic regression and is shown in the diagram.
Because adding the same constant to all coefficients does not change the softmax, the reference class coefficients are assumed equal to zero.
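A small R sketch (with hypothetical \(\lambda\) values) illustrating the softmax function and why adding the same constant to every \(\lambda\) leaves the probabilities unchanged, which is what allows fixing the reference class coefficients at zero:

# Softmax: exponentiate each linear predictor and normalize
softmax <- function(lambda) exp(lambda) / sum(exp(lambda))

lambda <- c(0, 1.5, -0.7, 2.2)  # hypothetical linear predictors for 4 classes
softmax(lambda)
softmax(lambda + 10)            # identical probabilities: the shift cancels in the ratio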
The data SoftmaxRegData2.csv are from Kruschke (2015), chapter 22.
<- read.csv(file="data/SoftmaxRegData2.csv" )
myData head(myData)
X1 X2 Y
1 -0.08714736 -1.0813422 2
2 -0.72256565 -1.5838631 2
3 0.17918961 0.9717904 3
4 -1.15975176 0.5026244 3
5 -0.72711762 1.3757045 3
6 0.53341559 1.7746506 3
table(myData$Y, useNA = "always")
1 2 3 4 <NA>
58 145 139 133 0
idx2 <- myData$Y==2
idx3 <- myData$Y==3
idx4 <- myData$Y==4
plot(myData$X1,myData$X2,pch=16)
points(myData$X1[idx2],myData$X2[idx2],pch=16,col="orange")
points(myData$X1[idx3],myData$X2[idx3],pch=16,col="cyan")
points(myData$X1[idx4],myData$X2[idx4],pch=16,col="magenta")
Prepare data list
dataListSoftmax <- list(N=nrow(myData),      # num of observations
                        K=max(myData$Y),     # num of groups
                        y=myData$Y,
                        x=cbind(x1 = myData$X1, x2 = myData$X2),
                        D=2)                 # num of predictors
Describe the model.
="
modelStringdata {
int<lower=2> K; // num of groups
int<lower=0> N; // num of observations
int<lower=1> D; // num of predictors
int<lower=1,upper=K> y[N];
matrix[N, D] x;
}
transformed data {
row_vector[D] zeros;
row_vector[D] x_m; // x means
row_vector[D] x_sd; // x standard deviations
matrix[N, D] zx; // normalized x
zeros = rep_row_vector(0, D); // coefficients are zeros for the baseline class
for (j in 1:D) {
x_m[j] = mean(x[,j]);
x_sd[j] = sd(x[,j]);
zx[,j] = (x[,j] - x_m[j]) / x_sd[j];
}
}
parameters {
matrix[K-1,D] zbeta_raw; // K-1 makes model identifiable
vector[K-1] zbeta0_raw;
}
transformed parameters {
vector[K] zbeta0; // intersection coeffs
matrix[K, D] zbeta; // predictor coeffs
zbeta0 = append_row(0, zbeta0_raw);
zbeta = append_row(zeros, zbeta_raw); // add zeros for coefficients of the baseclass
}
model {
zbeta0_raw ~ normal(0, 5);
for (k in 1:(K-1))
zbeta_raw[k,] ~ normal(0, 5);
for (n in 1:N)
y[n] ~ categorical(softmax(zbeta0 + zbeta * to_vector(zx[n,]) ));
}
generated quantities {
vector[K] beta0;
matrix[K, D] beta;
// transform zbetas to original betas:
for (k in 1:K) {
beta0[k] = zbeta0[k];
for (j in 1:D) {
beta0[k] = beta0[k] - zbeta[k,j] * x_m[j] / x_sd[j];
beta[k,j] = zbeta[k,j] / x_sd[j];
}
}
}
"
Create DSO.
modelSoftmax <- stan_model(model_code=modelString)
fit <- sampling(modelSoftmax,
                data=dataListSoftmax,
                pars=c('beta0', 'beta'),
                iter=5000, chains = 2, cores = 2)
We could analyze the fitted model using shinystan, but here we check the summary directly for demonstration.
summary(fit)$summary[,c(1,4,8:10)]
mean 2.5% 97.5% n_eff Rhat
beta0[1] 0.0000000 0.000000 0.0000000 NaN NaN
beta0[2] -3.8862330 -5.342788 -2.6322714 2248.385 1.0004666
beta0[3] -3.4331047 -4.773270 -2.2796907 3296.605 0.9997387
beta0[4] -3.8265114 -5.189532 -2.6072841 3164.843 0.9996566
beta[1,1] 0.0000000 0.000000 0.0000000 NaN NaN
beta[1,2] 0.0000000 0.000000 0.0000000 NaN NaN
beta[2,1] -5.5316729 -7.270050 -4.0375865 2447.257 1.0004653
beta[2,2] -5.3960430 -7.160777 -3.8669145 2404.709 1.0003155
beta[3,1] -1.4401162 -2.480155 -0.4521930 3834.628 0.9997128
beta[3,2] 8.7333497 6.723432 11.0850555 2888.872 0.9999173
beta[4,1] 8.1774008 6.235139 10.3564345 3186.626 0.9996173
beta[4,2] -0.7379993 -1.678778 0.2006212 3773.177 0.9999730
lp__ -123.4277181 -128.596525 -120.1976451 1642.521 1.0026270
The model has 4 classes and 2 predictors. The returned coefficients form a matrix \[\lambda_{i,1} = \beta_{0,1} + \beta_{1,1} x_{i,1} + \beta_{2,1} x_{i,2} \\ \lambda_{i,2} = \beta_{0,2} + \beta_{1,2} x_{i,1} + \beta_{2,2} x_{i,2} \\ \lambda_{i,3} = \beta_{0,3} + \beta_{1,3} x_{i,1} + \beta_{2,3} x_{i,2} \\ \lambda_{i,4} = \beta_{0,4} + \beta_{1,4} x_{i,1} + \beta_{2,4} x_{i,2}\]
In this system the first class is selected as the reference, so \(\beta_{0,1} = \beta_{1,1} = \beta_{2,1} = 0\).
stan_ac(fit, separate_chains = T)
stan_trace(fit)
pairs(fit,pars=c("beta0"))
pairs(fit,pars=c("beta"))
plot(fit)
To predict classes, use the formula for the probability of class \(k\): \[p_k = \frac{e^{\lambda_k}}{\sum_s e^{\lambda_s}}.\]
Create matrix of coefficients.
SoftmaxCoeff <- summary(fit)$summary[1:12, c(1)]
SoftmaxCoeff <- cbind(SoftmaxCoeff[1:4], matrix(SoftmaxCoeff[-(1:4)], ncol=2, byrow=T))
rownames(SoftmaxCoeff) <- paste0("Class", 1:4)
SoftmaxCoeff
[,1] [,2] [,3]
Class1 0.000000 0.000000 0.0000000
Class2 -3.886233 -5.531673 -5.3960430
Class3 -3.433105 -1.440116 8.7333497
Class4 -3.826511 8.177401 -0.7379993
Create linear predictors.
head(myData)
X1 X2 Y
1 -0.08714736 -1.0813422 2
2 -0.72256565 -1.5838631 2
3 0.17918961 0.9717904 3
4 -1.15975176 0.5026244 3
5 -0.72711762 1.3757045 3
6 0.53341559 1.7746506 3
linPredictors <- apply(SoftmaxCoeff[,-1], 1, function(z) z %*% t(myData[,1:2]))
dim(linPredictors)
[1] 475 4
head(linPredictors)
Class1 Class2 Class3 Class4
[1,] 0 6.317040 -9.318237 0.08539082
[2,] 0 12.543590 -12.791852 -4.73981916
[3,] 0 -6.235041 8.228932 0.74812465
[4,] 0 3.703185 6.059772 -9.85469145
[5,] 0 -3.401184 13.061642 -6.96120110
[6,] 0 -12.526772 14.730464 3.05226225
linPredictors <- t(apply(linPredictors, 1, function(z) z + SoftmaxCoeff[,1]))
dim(linPredictors)
[1] 475 4
head(linPredictors)
Class1 Class2 Class3 Class4
[1,] 0 2.4308066 -12.751342 -3.7411206
[2,] 0 8.6573571 -16.224956 -8.5663305
[3,] 0 -10.1212743 4.795827 -3.0783867
[4,] 0 -0.1830483 2.626667 -13.6812028
[5,] 0 -7.2874165 9.628537 -10.7877125
[6,] 0 -16.4130045 11.297359 -0.7742491
Check calculation for the first row of the data and second class.
row1 <- myData[1,]
Class2 <- SoftmaxCoeff[2,1] + SoftmaxCoeff[2,2]*row1[1] + SoftmaxCoeff[2,3]*row1[2]
c(Class2, linPredictors[1,2])
$X1
[1] 2.430807
$Class2
[1] 2.430807
Create probabilities
softmaxProb <- exp(linPredictors)/apply(exp(linPredictors), 1, sum)
apply(head(softmaxProb), 1, sum)
[1] 1 1 1 1 1 1
Predict classes.
predClass <- apply(softmaxProb, 1, which.max)
head(predClass)
[1] 2 2 3 3 3 3
Plot predicted classes and compare them with the data.
idx2Pred <- predClass==2
idx3Pred <- predClass==3
idx4Pred <- predClass==4
par(mfrow=c(1,2))
plot(myData$X1,myData$X2,pch=16)
points(myData$X1[idx2],myData$X2[idx2],pch=16,col="orange")
points(myData$X1[idx3],myData$X2[idx3],pch=16,col="cyan")
points(myData$X1[idx4],myData$X2[idx4],pch=16,col="magenta")
plot(myData$X1,myData$X2,pch=16)
points(myData$X1[idx2Pred],myData$X2[idx2Pred],pch=16,col="orange")
points(myData$X1[idx3Pred],myData$X2[idx3Pred],pch=16,col="cyan")
points(myData$X1[idx4Pred],myData$X2[idx4Pred],pch=16,col="magenta")
par(mfrow=c(1,1))
See how different classes are separated by hyperplanes.
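Each grey line below is the set of points where class \(k\) and the reference class 1 have equal linear predictors (\(\lambda_k = \lambda_1 = 0\)):
\[
\beta_{0,k} + \beta_{1,k} x_1 + \beta_{2,k} x_2 = 0
\quad\Longrightarrow\quad
x_2 = -\frac{\beta_{0,k} + \beta_{1,k} x_1}{\beta_{2,k}},
\]
which is what the lines() calls compute from SoftmaxCoeff.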
Add hyperplane between class 1 and class 2:
plot(myData$X1,myData$X2,pch=16)
points(myData$X1[idx2Pred],myData$X2[idx2Pred],pch=16,col="orange")
points(myData$X1[idx3Pred],myData$X2[idx3Pred],pch=16,col="cyan")
points(myData$X1[idx4Pred],myData$X2[idx4Pred],pch=16,col="magenta")
lines(myData$X1,-(SoftmaxCoeff[2,1]+SoftmaxCoeff[2,2]*myData$X1)/SoftmaxCoeff[2,3],col="grey")
Add hyperplane between class 1 and class 3.
plot(myData$X1,myData$X2,pch=16)
points(myData$X1[idx2Pred],myData$X2[idx2Pred],pch=16,col="orange")
points(myData$X1[idx3Pred],myData$X2[idx3Pred],pch=16,col="cyan")
points(myData$X1[idx4Pred],myData$X2[idx4Pred],pch=16,col="magenta")
lines(myData$X1,-(SoftmaxCoeff[2,1]+SoftmaxCoeff[2,2]*myData$X1)/SoftmaxCoeff[2,3],col="grey")
lines(myData$X1,-(SoftmaxCoeff[3,1]+SoftmaxCoeff[3,2]*myData$X1)/SoftmaxCoeff[3,3],col="grey")
Add hyperplane between class 1 and class 4.
plot(myData$X1,myData$X2,pch=16)
points(myData$X1[idx2Pred],myData$X2[idx2Pred],pch=16,col="orange")
points(myData$X1[idx3Pred],myData$X2[idx3Pred],pch=16,col="cyan")
points(myData$X1[idx4Pred],myData$X2[idx4Pred],pch=16,col="magenta")
lines(myData$X1,-(SoftmaxCoeff[2,1]+SoftmaxCoeff[2,2]*myData$X1)/SoftmaxCoeff[2,3],col="grey")
lines(myData$X1,-(SoftmaxCoeff[3,1]+SoftmaxCoeff[3,2]*myData$X1)/SoftmaxCoeff[3,3],col="grey")
lines(myData$X1,-(SoftmaxCoeff[4,1]+SoftmaxCoeff[4,2]*myData$X1)/SoftmaxCoeff[4,3],col="grey")